from joblib import dump, load
import pandas as pd

# read in pulled 2014 data
eitc = pd.read_csv('/REDACTED/2014_full_pop_ind_return_data.csv')

# subset to important features used to train prediction model (that are present in our pull)
top_38_plus_taxpayer_id = [REDACTED. FEATURES AVAILABLE UPON REQUEST OF AUTHORS WITH APPROPRIATE CLEARANCE FROM IRS.]

eitc_restrict_cols = eitc[top_38_plus_taxpayer_id]

# read in dep_database data to grab other features from top 40
dep_database = pd.read_csv('/REDACTED/data/raw/receivedData/dep_database/dep_database_scored_TY2014_masked.csv')
dep_database.columns = dep_database.columns.str.lower()

# read in 2014 op data
individual_2014 = pd.read_csv('/REDACTED/data/final/individualBISG2014_full_final.csv')

# no duplicate taxpayer_ids in individual_2014
dep_database_merged = dep_database.merge(individual_2014, how = 'left', on = 'taxpayer_id') #dep_database is ~5.8 million, ~11k don't have new taxpayer_id

# merge everything together to get a superset of features
eitc_restrict_cols = eitc_restrict_cols.rename(columns={"taxpayer_id":"taxpayer_id_new"})
features = eitc_restrict_cols.merge(dep_database_merged, how = 'left', on = ['taxpayer_id_new'])
features = features.rename(columns={"irs_dep_risk_score_x": "irs_dep_risk_score"})
features = features.rename(columns={"eic_x": "eic"})
features = features.rename(columns={"filing_status_x": "filing_status"})

# following procedure in clean_data.py
features['irs_dep_risk_score'] = features['irs_dep_risk_score'].fillna(-15000)

# subset to top 40 features and taxpayer_ids
top_40_plus_taxpayer_id = [REDACTED. FEATURES AVAILABLE UPON REQUEST OF AUTHORS WITH APPROPRIATE CLEARANCE FROM IRS.]

features_restrict_cols = features[top_40_plus_taxpayer_id]
# fill NaN with zero
features_restrict_cols = features_restrict_cols.fillna(0)

# load in prediction model for each of the 5 folds
mdl_0 = load('/REDACTED/data/modeled_refactor_temp/top_40_features/EITC_NCMP_RF_Reg_plus_dep_database_train_set_0_outcome_ref_cred_amt_dif_pv.joblib')
mdl_1 = load('/REDACTED/data/modeled_refactor_temp/top_40_features/EITC_NCMP_RF_Reg_plus_dep_database_train_set_1_outcome_ref_cred_amt_dif_pv.joblib')
mdl_2 = load('/REDACTED/data/modeled_refactor_temp/top_40_features/EITC_NCMP_RF_Reg_plus_dep_database_train_set_2_outcome_ref_cred_amt_dif_pv.joblib')
mdl_3 = load('/REDACTED/data/modeled_refactor_temp/top_40_features/EITC_NCMP_RF_Reg_plus_dep_database_train_set_3_outcome_ref_cred_amt_dif_pv.joblib')
mdl_4 = load('/REDACTED/data/modeled_refactor_temp/top_40_features/EITC_NCMP_RF_Reg_plus_dep_database_train_set_4_outcome_ref_cred_amt_dif_pv.joblib')

# predict refundable credit overclaiming using each of the 5 models

# fold 0
features0 = features_restrict_cols
features0['sort_var'] = mdl_0.predict(features0.drop(columns=['taxpayer_id', 'taxpayer_id_new']))
preds_0 = features0
preds_0.sort_var.isnull().sum()

# fold 1
features_restrict_cols = features[top_40_plus_taxpayer_id]
features_restrict_cols = features_restrict_cols.fillna(0)
preds_1 = features_restrict_cols
preds_1['sort_var'] = mdl_1.predict(preds_1.drop(columns=['taxpayer_id', 'taxpayer_id_new']))
preds_1.sort_var.isnull().sum()

# fold 2
features_restrict_cols = features[top_40_plus_taxpayer_id]
features_restrict_cols = features_restrict_cols.fillna(0)
preds_2 = features_restrict_cols
preds_2['sort_var'] = mdl_2.predict(preds_2.drop(columns=['taxpayer_id', 'taxpayer_id_new']))
preds_2.sort_var.isnull().sum()

# fold 3
features_restrict_cols = features[top_40_plus_taxpayer_id]
features_restrict_cols = features_restrict_cols.fillna(0)
preds_3 = features_restrict_cols
preds_3['sort_var'] = mdl_3.predict(preds_3.drop(columns=['taxpayer_id', 'taxpayer_id_new']))
preds_3.sort_var.isnull().sum()

# fold 4
features_restrict_cols = features[top_40_plus_taxpayer_id]
features_restrict_cols = features_restrict_cols.fillna(0)
preds_4 = features_restrict_cols
preds_4['sort_var'] = mdl_4.predict(preds_4.drop(columns=['taxpayer_id', 'taxpayer_id_new']))
preds_4.sort_var.isnull().sum()

# for each set of predictions, create unique name for predictions and de-duplicate by the maximum prediction
preds_0 = preds_0.rename(columns={'sort_var':'sort_var_0'})
preds_0 = preds_0.sort_values('sort_var_0', ascending=False).drop_duplicates('taxpayer_id_new').sort_index()

preds_1 = preds_1.rename(columns={'sort_var':'sort_var_1'})
preds_1f = preds_1[['taxpayer_id_new', 'sort_var_1']]
preds_1f = preds_1f.sort_values('sort_var_1', ascending=False).drop_duplicates('taxpayer_id_new').sort_index()

preds_2 = preds_2.rename(columns={'sort_var':'sort_var_2'})
preds_2f = preds_2[['taxpayer_id_new', 'sort_var_2']]
preds_2f = preds_2f.sort_values('sort_var_2', ascending=False).drop_duplicates('taxpayer_id_new').sort_index()

preds_3 = preds_3.rename(columns={'sort_var':'sort_var_3'})
preds_3f = preds_3[['taxpayer_id_new', 'sort_var_3']]
preds_3f = preds_3f.sort_values('sort_var_3', ascending=False).drop_duplicates('taxpayer_id_new').sort_index()

preds_4 = preds_4.rename(columns={'sort_var':'sort_var_4'})
preds_4f = preds_4[['taxpayer_id_new', 'sort_var_4']]
preds_4f = preds_4f.sort_values('sort_var_4', ascending=False).drop_duplicates('taxpayer_id_new').sort_index()

# merge all 5 sets of predictions together
predictions_01 = preds_0.merge(preds_1f, how = 'left', on = 'taxpayer_id_new')
predictions_01.head()
predictions_01.sort_var_1.isnull().sum()

predictions_012 = predictions_01.merge(preds_2f, how = 'left', on = 'taxpayer_id_new')
predictions_012.head()
predictions_012.sort_var_2.isnull().sum()

predictions_0123 = predictions_012.merge(preds_3f, how = 'left', on = 'taxpayer_id_new')
predictions_0123.head()
predictions_0123.sort_var_3.isnull().sum()

predictions_01234 = predictions_0123.merge(preds_4f, how = 'left', on = 'taxpayer_id_new')
predictions_01234.head()
predictions_01234.sort_var_4.isnull().sum()

# create variable for average prediction
predictions_01234['sort_var_avg'] = (predictions_01234['sort_var_0'] + predictions_01234['sort_var_1'] + predictions_01234['sort_var_2'] + predictions_01234['sort_var_3'] + predictions_01234['sort_var_4']) / 5

# write out results
predictions_01234.to_csv('/REDACTED/eitc_predictions_all.csv', index=False)


